# import 'Pandas'
import pandas as pd
# import 'Numpy'
import numpy as np
# import subpackage of 'Matplotlib'
import matplotlib.pyplot as plt
# import 'Seaborn'
import seaborn as sns
# import 'plotly.express'
import plotly.express as pltx
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')
# import scipy.stats
import scipy.stats as st
# To load and read the dataset
df = sns.load_dataset('tips')
# To get the top 5 data
df.head()
| | total_bill | tip | sex | smoker | day | time | size |
|---|---|---|---|---|---|---|---|
| 0 | 16.99 | 1.01 | Female | No | Sun | Dinner | 2 |
| 1 | 10.34 | 1.66 | Male | No | Sun | Dinner | 3 |
| 2 | 21.01 | 3.50 | Male | No | Sun | Dinner | 3 |
| 3 | 23.68 | 3.31 | Male | No | Sun | Dinner | 2 |
| 4 | 24.59 | 3.61 | Female | No | Sun | Dinner | 4 |
# To get the number of rows and columns
df.shape
(244, 7)
# To get the datatypes of the different columns and also the number of non-null values
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244 entries, 0 to 243 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 total_bill 244 non-null float64 1 tip 244 non-null float64 2 sex 244 non-null category 3 smoker 244 non-null category 4 day 244 non-null category 5 time 244 non-null category 6 size 244 non-null int64 dtypes: category(4), float64(2), int64(1) memory usage: 7.4 KB
# To get the number of null values
df.isnull().sum()
total_bill 0 tip 0 sex 0 smoker 0 day 0 time 0 size 0 dtype: int64
# To get the types of datatypes
df.dtypes
total_bill float64 tip float64 sex category smoker category day category time category size int64 dtype: object
* There are 3 numerical columns (total_bill, tip, size) and 4 categorical columns (sex, smoker, day, time)
# value_counts is used to describe the categorical data
df.sex.value_counts()
sex Male 157 Female 87 Name: count, dtype: int64
# To calculate the percentage
df.sex.value_counts(normalize=True)*100
sex Male 64.344262 Female 35.655738 Name: proportion, dtype: float64
# To round off the decimal places
round(df.sex.value_counts(normalize=True)*100,0)
sex Male 64.0 Female 36.0 Name: proportion, dtype: float64
It indicates that 64% of the transactions are done by Males and 36% are done by Females
Find out the % of transactions for lunch and dinner
round(df.time.value_counts(normalize=True)*100,0)
time Dinner 72.0 Lunch 28.0 Name: proportion, dtype: float64
It indicates that 72% transactions are for dinner and 28% are for lunch
# To visualise in bar chart
sns.countplot(x=df.sex)
<Axes: xlabel='sex', ylabel='count'>
sns.countplot(x=df.time)
<Axes: xlabel='time', ylabel='count'>
df.sex.value_counts().plot(kind='pie')
<Axes: ylabel='count'>
df.time.value_counts().plot(kind='pie')
<Axes: ylabel='count'>
Measures of Location
# To sort all the values
df.total_bill.sort_values()
67 3.07
92 5.75
111 7.25
172 7.25
149 7.51
...
182 45.35
156 48.17
59 48.27
212 48.33
170 50.81
Name: total_bill, Length: 244, dtype: float64
# To calculate Q1, i.e. the 25th percentile (the value below which 25% of the data fall)
df.total_bill.quantile(0.25)
13.3475
# To get q2
df.total_bill.quantile(0.5)
17.795
# to get q3
df.total_bill.quantile(0.75)
24.127499999999998
# To get Q4 (the 100th percentile, i.e. the maximum)
df.total_bill.quantile(1)
50.81
# To get Q0 (the 0th percentile, i.e. the minimum)
df.total_bill.quantile(0)
3.07
# To get the five-point summary (along with count, mean and std) in a single command
df.total_bill.describe()
count 244.000000 mean 19.785943 std 8.902412 min 3.070000 25% 13.347500 50% 17.795000 75% 24.127500 max 50.810000 Name: total_bill, dtype: float64
# To see the fivepoint summary visually
sns.boxplot(y=df.total_bill)
<Axes: ylabel='total_bill'>
# To get an interactive boxplot.
# plotly.express is already imported as `pltx` at the top of the file, so it
# is not imported again here.
pltx.box(df.total_bill)
A boxplot shows the five-point summary (minimum, Q1, median, Q3, maximum); of the measures of central tendency it shows only the median, not the mean or the mode
# To study five point summary as well as identifying outliers
pltx.box(df.tip)
# Mean
df.total_bill.mean()
19.78594262295082
# Median
df.total_bill.median()
17.795
# Mode
df.total_bill.mode()
0 13.42 Name: total_bill, dtype: float64
# Range
df.total_bill.max() - df.total_bill.min()
47.74
# Variance (mean squared deviation; pandas uses the sample formula, ddof=1, by default)
df.total_bill.var()
79.25293861397826
# Standard deviation (root mean squared deviation; pandas uses the sample formula, ddof=1, by default)
df.total_bill.std()
8.902411954856856
np.sqrt(df.total_bill.var())
8.902411954856856
# To calculate skewness
df.total_bill.skew()
1.1332130376158205
# To get the skewness chart (density curve) along with the histogram.
# sns.distplot is deprecated and scheduled for removal from seaborn; the
# documented replacement for its default output is histplot with a KDE
# overlay, normalised to a density so the y-axis stays 'Density'.
sns.histplot(df.total_bill, kde=True, stat="density", kde_kws=dict(cut=3))
<Axes: xlabel='total_bill', ylabel='Density'>
# To get the skewness chart without the histogram (density curve only).
# sns.distplot(..., hist=False) is deprecated; kdeplot is the documented
# replacement and draws the same kernel density estimate.
sns.kdeplot(df.total_bill)
<Axes: xlabel='total_bill', ylabel='Density'>
When the data are right-skewed, the mean is greater than the median (here the mean of total_bill, 19.79, exceeds its median, 17.80)
df.total_bill.describe()
count 244.000000 mean 19.785943 std 8.902412 min 3.070000 25% 13.347500 50% 17.795000 75% 24.127500 max 50.810000 Name: total_bill, dtype: float64
df.tip.skew()
1.4654510370979401
# Density curve of the tip column (no histogram).
# sns.distplot(..., hist=False) is deprecated; kdeplot is the documented
# replacement.
sns.kdeplot(df.tip)
<Axes: xlabel='tip', ylabel='Density'>
df.tip.describe()
count 244.000000 mean 2.998279 std 1.383638 min 1.000000 25% 2.000000 50% 2.900000 75% 3.562500 max 10.000000 Name: tip, dtype: float64
df.total_bill.kurt()
1.2184840156638854
df.tip.kurt()
3.648375873352852
df.total_bill.describe()
count 244.000000 mean 19.785943 std 8.902412 min 3.070000 25% 13.347500 50% 17.795000 75% 24.127500 max 50.810000 Name: total_bill, dtype: float64
df.tip.describe()
count 244.000000 mean 2.998279 std 1.383638 min 1.000000 25% 2.000000 50% 2.900000 75% 3.562500 max 10.000000 Name: tip, dtype: float64
It seems like you're encountering the same issue. The error is occurring because the cov() method is trying to convert the entire DataFrame to a numeric type, and it's failing due to the presence of non-numeric values, specifically the string 'No'.
To address this issue, you can manually convert the non-numeric columns to numeric or handle them separately. Here's an example:
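```python
import pandas as pd
import numpy as np

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = pd.to_numeric(df[col], errors='coerce')

cov_matrix = df.cov()
print(cov_matrix)
```
This code iterates through each column in the DataFrame and uses pd.to_numeric to convert columns with 'object' dtype (typically strings) to numeric values. The errors='coerce' parameter replaces any non-numeric values with NaN. After this conversion, you should be able to calculate the covariance without encountering the 'could not convert string to float' error.
Note: in the tips dataset loaded above, the non-numeric columns (sex, smoker, day, time) have dtype 'category', not 'object', so the loop leaves them unchanged. Restricting the calculation to the numeric columns, e.g. df.cov(numeric_only=True), avoids the error without modifying the DataFrame.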
Make sure to check and handle the NaN values in the resulting covariance matrix as needed for your analysis.
# Scatter plots for the numeric column pairs, one figure per pair:
# tip vs. total_bill, total_bill vs. size, and tip vs. size.
for x_col, y_col in (('total_bill', 'tip'), ('size', 'total_bill'), ('size', 'tip')):
    sns.scatterplot(data=df, x=x_col, y=y_col)
    plt.show()
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x1b568968b10>